#data cleanup
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(stringr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
#Visualization
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(DT)

#Data
library(bea.R)
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
## Note: As of February 2018, beaGet() requires 'TableName' for NIPA and NIUnderlyingDetail data instead of 'TableID.' See https://github.us-bea/bea.R for details.
library(devtools)
## Loading required package: usethis
## Error in get(genname, envir = envir) : 找不到对象'testthat_print'
library(gtrendsR)

#Text Analysis
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

#Forecasting
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
## 
##     first, last
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(forecast)
library(tseries)
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following object is masked from 'package:data.table':
## 
##     :=
## The following object is masked from 'package:magrittr':
## 
##     set_names

data cleaning

# load the dataset
library(tidyverse)
## ─ Attaching packages ─────────────────────────────────────────────── tidyverse 1.3.0 ─
## ✓ tibble  3.0.1     ✓ purrr   0.3.4
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ─ Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ─
## x rlang:::=()              masks data.table:::=()
## x purrr::%@%()             masks rlang::%@%()
## x purrr::as_function()     masks rlang::as_function()
## x lubridate::as.difftime() masks base::as.difftime()
## x data.table::between()    masks dplyr::between()
## x lubridate::date()        masks base::date()
## x magrittr::extract()      masks tidyr::extract()
## x plotly::filter()         masks dplyr::filter(), stats::filter()
## x xts::first()             masks data.table::first(), dplyr::first()
## x purrr::flatten()         masks rlang::flatten()
## x purrr::flatten_chr()     masks rlang::flatten_chr()
## x purrr::flatten_dbl()     masks rlang::flatten_dbl()
## x purrr::flatten_int()     masks rlang::flatten_int()
## x purrr::flatten_lgl()     masks rlang::flatten_lgl()
## x purrr::flatten_raw()     masks rlang::flatten_raw()
## x data.table::hour()       masks lubridate::hour()
## x lubridate::intersect()   masks base::intersect()
## x purrr::invoke()          masks rlang::invoke()
## x data.table::isoweek()    masks lubridate::isoweek()
## x dplyr::lag()             masks stats::lag()
## x xts::last()              masks data.table::last(), dplyr::last()
## x purrr::list_along()      masks rlang::list_along()
## x data.table::mday()       masks lubridate::mday()
## x data.table::minute()     masks lubridate::minute()
## x purrr::modify()          masks rlang::modify()
## x data.table::month()      masks lubridate::month()
## x purrr::prepend()         masks rlang::prepend()
## x data.table::quarter()    masks lubridate::quarter()
## x data.table::second()     masks lubridate::second()
## x purrr::set_names()       masks rlang::set_names(), magrittr::set_names()
## x lubridate::setdiff()     masks base::setdiff()
## x purrr::splice()          masks rlang::splice()
## x purrr::transpose()       masks data.table::transpose()
## x lubridate::union()       masks base::union()
## x data.table::wday()       masks lubridate::wday()
## x data.table::week()       masks lubridate::week()
## x data.table::yday()       masks lubridate::yday()
## x data.table::year()       masks lubridate::year()
stock = read.csv("BitcoinStockMarketHistoricalData_2014_2021.csv", stringsAsFactors=FALSE)

stock$Date = as.Date(stock$Date, format =  "%m/%d/%Y")
stock$Open = as.numeric(stock$Open)
## Warning: 强制改变过程中产生了NA
stock$High = as.numeric(stock$High)
## Warning: 强制改变过程中产生了NA
stock$Low = as.numeric(stock$Low)
## Warning: 强制改变过程中产生了NA
stock$Close = as.numeric(stock$Close)
## Warning: 强制改变过程中产生了NA
stock$Adj.Close = as.numeric(stock$Adj.Close)
## Warning: 强制改变过程中产生了NA
stock$Volume = as.numeric(stock$Volume)
## Warning: 强制改变过程中产生了NA
# remove missing values
stock = na.omit(stock)
sum(is.na(stock))
## [1] 0
head(stock,6)
##         Date    Open    High     Low   Close Adj.Close   Volume
## 1 2014-09-17 465.864 468.174 452.422 457.334   457.334 21056800
## 2 2014-09-18 456.860 456.860 413.104 424.440   424.440 34483200
## 3 2014-09-19 424.103 427.835 384.532 394.796   394.796 37919700
## 4 2014-09-20 394.673 423.296 389.883 408.904   408.904 36863600
## 5 2014-09-21 408.085 412.426 393.181 398.821   398.821 26580100
## 6 2014-09-22 399.100 406.916 397.130 402.152   402.152 24127600

Visualization

p1 <- stock %>%
  plot_ly(x = ~Date,
          type = "candlestick", 
          open = ~Open, 
          close = ~Close, 
          high = ~High,
          low = ~Low,
          name = "price") %>%
  layout(
    xaxis = list(
      rangeselector = list(
        buttons = list(
          list(
            count = 1,
            label = "1 mo",
            step = "week",
            stepmode = "backward"),
          list(
            count = 3,
            label = "3 mo",
            step = "month",
            stepmode = "backward"),
          list(
            count = 6,
            label = "6 mo",
            step = "month",
            stepmode = "backward"),
          list(
            count = 1,
            label = "1 yr",
            step = "year",
            stepmode = "backward"),
          list(
            count = 3,
            label = "3 yr",
            step = "year",
            stepmode = "backward"),
          list(step = "all"))),
      rangeslider = list(visible = FALSE)),
         yaxis = list(title = "Price ($)",
                      showgrid = TRUE,
                      showticklabels = TRUE))
p2 <- stock %>%
  plot_ly(x=~Date, y=~Volume, type='bar', name = "Volume") %>%
  layout(yaxis = list(title = "Volume"))

p <- subplot(p1, p2, heights = c(0.7,0.3), nrows=2,
             shareX = TRUE, titleY = TRUE) %>%
  layout(title = "Bitcoin")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
p

Initial Analysis

bit_ts = stock %>%
  filter(Date > as.Date('2017-01-01')) %>%
  arrange(Date) %>%
  select(Adj.Close) %>%
  as.matrix() %>%
  ts()

my_theme = theme(panel.grid = element_line(color = '#e6e6e6'),
                 panel.background = element_rect(fill = 'white'),
                 plot.title = element_text(hjust = .5, size = 28, colour = '#ffa500'),
                 text = element_text(family = 'Georgia'),
                 axis.text = element_text(size = 10),
                 axis.title = element_text(size = 18, family = 'Georgia', face = 'bold'),
                 axis.line = element_line(colour = '#737373', size = 1),
                 strip.background = element_rect(colour = "black", fill = "white"),
                 strip.text = element_text(face = 'bold'))  

gglagplot(bit_ts, do.lines = F) + my_theme +
         scale_color_continuous(low = "#b37400", high = "#ffc04d", breaks = c(1, 366, 731, 1096, 1462), labels = c('2017', '2018', '2019','2020','2021')) + 
  scale_y_continuous(breaks = c(0, 15000, 30000, 45000), 
                     labels = c('$0', '$15,000', '$30,000', '$45,000')) +
  scale_x_continuous(breaks = c(30000, 60000), 
                     labels = c('$30,000', '$60,000'))

ggAcf(bit_ts, lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation')

ggPacf(bit_ts, lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

ggAcf(diff(bit_ts), lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation') 

ggPacf(diff(bit_ts), lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

The First Difference

cut_bit_df = stock %>%
  filter(Date > as.Date('2017-01-01'))

ggplotly(cut_bit_df[-1,] %>%
  mutate(Price = diff(cut_bit_df$Adj.Close)) %>%
  ggplot(aes(Date, Price)) + geom_line(col = '#ffa500') + my_theme + 
  labs(x = '', title = 'Bitcoin Differenced By One', y = 'Difference'))

transformation

BoxCox.lambda(bit_ts)
## [1] -0.0806947
# before transformation
ggplotly(stock %>%
           filter(Date >= as.Date('2017-01-01')) %>% ggplot(aes(Date, Adj.Close)) + geom_line(col = '#ffa500') + 
  labs(title = 'Bitcoin', x = '') +
  scale_y_continuous(breaks = c(0, 10000, 20000, 30000, 40000, 50000, 60000), 
                     labels = c('$0', '$10,000', '$20,000', '$30,000', '$40,000', '$50,000', '$60,000')) + my_theme)
# after transformation
ggplotly(stock %>%
           mutate(Price = BoxCox(stock$Adj.Close, lambda = BoxCox.lambda(stock$Adj.Close))) %>%
           ggplot(aes(Date, Adj.Close)) + geom_line(col = '#ffa500') + 
  labs(title = 'Bitcoin', x = '', y = 'Price (Transformed)') + my_theme)

Model Fitting

bit_ts_tran = BoxCox(bit_ts, lambda = BoxCox.lambda(bit_ts))

ggAcf(diff(bit_ts_tran), lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation') 

ggPacf(diff(bit_ts_tran), lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

auto.arima(bit_ts_tran)
## Series: bit_ts_tran 
## ARIMA(3,1,1) with drift 
## 
## Coefficients:
##          ar1     ar2      ar3      ma1   drift
##       0.8974  0.0609  -0.0209  -0.9251  0.0012
## s.e.  0.1262  0.0346   0.0294   0.1235  0.0006
## 
## sigma^2 estimated as 0.000445:  log likelihood=3706.04
## AIC=-7400.08   AICc=-7400.02   BIC=-7368.13
checkresiduals(auto.arima(bit_ts_tran))

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,1) with drift
## Q* = 13.263, df = 5, p-value = 0.02103
## 
## Model df: 5.   Total lags used: 10

model again

cut2_bit_df = stock %>%
           filter(Date >= ymd('2019-01-01'))

ggplotly(cut2_bit_df %>%
           mutate(Price = BoxCox(cut2_bit_df$Adj.Close, lambda = BoxCox.lambda(cut2_bit_df$Adj.Close))) %>%
           ggplot(aes(Date,Adj.Close)) + geom_line(col = '#ffa500') + 
  labs(title = 'Bitcoin', x = '', y = 'Price (Transformed)') + my_theme)
bit_ts2 = stock %>%
  filter(Date >= as.Date('2019-01-01')) %>%
  arrange(Date) %>%
  select(Adj.Close) %>%
  as.matrix() %>%
  ts()

bit_ts_tran2 = BoxCox(bit_ts2, lambda = BoxCox.lambda(bit_ts2))

ggAcf(bit_ts_tran2, lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation') 

ggPacf(bit_ts_tran2, lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

auto.arima(bit_ts_tran2)
## Series: bit_ts_tran2 
## ARIMA(1,1,1) with drift 
## 
## Coefficients:
##           ar1     ma1   drift
##       -0.8011  0.7390  0.0012
## s.e.   0.1624  0.1827  0.0005
## 
## sigma^2 estimated as 0.0002113:  log likelihood=2220.29
## AIC=-4432.58   AICc=-4432.53   BIC=-4413.9
checkresiduals(auto.arima(bit_ts_tran2))

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1) with drift
## Q* = 8.0078, df = 7, p-value = 0.3319
## 
## Model df: 3.   Total lags used: 10
autoplot(forecast(auto.arima(bit_ts_tran2)))

cut2_bit_df = stock %>%
           filter(Date >= ymd('2018-01-01') & Date <= ymd('2019-01-01')) 

ggplotly(cut2_bit_df %>%
           mutate(Price = BoxCox(cut2_bit_df$Adj.Close, lambda = BoxCox.lambda(cut2_bit_df$Adj.Close))) %>%
           ggplot(aes(Date,Adj.Close)) + geom_line(col = '#ffa500') + 
  labs(title = 'Bitcoin', x = '', y = 'Price (Transformed)') + my_theme)
bit_ts2 = stock %>%
  filter(Date >= as.Date('2018-01-01') & Date <= as.Date('2019-01-01')) %>%
  arrange(Date) %>%
  select(Adj.Close) %>%
  as.matrix() %>%
  ts()

bit_ts_tran2 = BoxCox(bit_ts2, lambda = BoxCox.lambda(bit_ts2))

ggAcf(bit_ts_tran2, lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation') 

ggPacf(bit_ts_tran2, lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

auto.arima(bit_ts_tran2)
## Series: bit_ts_tran2 
## ARIMA(1,1,0) with drift 
## 
## Coefficients:
##           ar1   drift
##       -0.0414  -5e-04
## s.e.   0.0526   3e-04
## 
## sigma^2 estimated as 3.554e-05:  log likelihood=1353.01
## AIC=-2700.02   AICc=-2699.96   BIC=-2688.32
checkresiduals(auto.arima(bit_ts_tran2))

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0) with drift
## Q* = 11.636, df = 8, p-value = 0.1682
## 
## Model df: 2.   Total lags used: 10
summary(Arima(bit_ts_tran2, order = c(1,1,1), include.drift = T))
## Series: bit_ts_tran2 
## ARIMA(1,1,1) with drift 
## 
## Coefficients:
##           ar1     ma1   drift
##       -0.3597  0.3093  -5e-04
## s.e.   0.4810  0.4852   3e-04
## 
## sigma^2 estimated as 3.559e-05:  log likelihood=1353.23
## AIC=-2698.46   AICc=-2698.35   BIC=-2682.86
## 
## Training set error measures:
##                        ME        RMSE         MAE          MPE     MAPE
## Training set 1.242138e-05 0.005933249 0.004201422 0.0001799258 0.108142
##                  MASE      ACF1
## Training set 1.007819 0.0153384
err = residuals(Arima(bit_ts_tran2, order = c(1,1,1), include.drift = T))
invers_BoxCox = function(ts_data, lambda){
  original_ts = (ts_data * lambda + 1) ** (1/lambda)
  return(original_ts)
}

invers_BoxCox(sd(err), BoxCox.lambda(bit_ts))
## [1] 1.00596

text data

# load the dataset
news = read.csv("BitcoinNews_2021_1.csv", fileEncoding = "UTF-8", stringsAsFactors=FALSE)
news$Date = as.Date(news$Date, format =  "%m/%d/%Y")
head(news,6)
##         Date     Source
## 1 2021-02-07   bbc-news
## 2 2021-02-08   engadget
## 3 2021-02-08 techcrunch
## 4 2021-02-08       None
## 5 2021-02-08 techcrunch
## 6 2021-02-08    reuters
##                                                                            Title
## 1                                     FCA warning over risky TikTok trading tips
## 2                           Tesla buys in Bitcoin will soon accept it as payment
## 3 Tesla buys B in bitcoin may accept the cryptocurrency as payment in the future
## 4        Impeachment Trial Child Tax Credit Bitcoin Your Monday Evening Briefing
## 5                                      Daily Crunch DoorDash acquires Chowbotics
## 6                    FOREX Dollar steadies after U S jobs related losses Reuters
##                                                                                                                                                                                                                            Headline
## 1                                                                                                                                                      Some TikTok creators have been giving investment advice wake GameStop frenzy
## 2                Elon Musk cryptocurrency hype more than just idle talk CNBC reports that Tesla only bought billion worth Bitcoin help diversify maximize returns will start taking payments using digital asset sometime near futu
## 3                                                Today filing Tesla disclosed that acquired billion bitcoin popular cryptocurrency Moreover company noted that also accept bitcoin future form payment cars though allow that there
## 4                                                                                                                                                                                                               Here what need know
## 5                         DoorDash acquires salad making robotics startup Twitter confirms subscription plans Tesla makes bitcoin This your Daily Crunch February story DoorDash acquires Chowbotics DoorDash acquired Area startup
## 6 Dollar index little changed after Friday payrolls fall Jobs data takes shine dollar rebound Ethereum gains futures debut Bitcoin hits record high after Tesla purchase Graphic World rates https tmsnrt RBWI Adds details Bitcoin
news_text = news$Headline

# cleaning
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
news_source = Corpus(VectorSource(as.vector(news_text))) 
news_corpus = tm_map(news_source, tolower)  #transfer to lowercase
## Warning in tm_map.SimpleCorpus(news_source, tolower): transformation drops
## documents
news_corpus = tm_map(news_corpus, removeNumbers) # remove numbers
## Warning in tm_map.SimpleCorpus(news_corpus, removeNumbers): transformation drops
## documents
news_corpus = tm_map(news_corpus, removePunctuation) # remove punctuation 
## Warning in tm_map.SimpleCorpus(news_corpus, removePunctuation): transformation
## drops documents
news_corpus = tm_map(news_corpus, stripWhitespace) # strip white spaces
## Warning in tm_map.SimpleCorpus(news_corpus, stripWhitespace): transformation
## drops documents
news_corpus = tm_map(news_corpus, stemDocument, language = "english") # remove common word endings
## Warning in tm_map.SimpleCorpus(news_corpus, stemDocument, language = "english"):
## transformation drops documents
news_corpus = tm_map(news_corpus, removeWords, stopwords("english")) # remove useless words
## Warning in tm_map.SimpleCorpus(news_corpus, removeWords, stopwords("english")):
## transformation drops documents

News Sentiment Analysis

news_words <- news %>%
  select(c("Date","Source", "Title", "Headline")) %>%
  unnest_tokens(word, Headline) %>%
  filter(!word %in% append(stop_words$word, values = "chars"), str_detect(word, "^[a-z']+$"))
news_words$date = news_words$Date

words_only <- news_words %>%
  count(word, sort =TRUE)

set.seed(1)
wordcloud(words = words_only$word, freq = words_only$n, scale=c(5,.5), max.words=50, colors=brewer.pal(8, "Dark2"))

wordcloud

## Build a term-document matrix
news_dtm =DocumentTermMatrix(news_corpus)
news_dtm = as.matrix(news_dtm)

## Print out the top 10 most frequent words
WordFreq = colSums(news_dtm)
ord = order(WordFreq)
WordFreq[tail(ord,10)]
##       cnbc       elon       musk       high     record    billion     invest 
##         21         21         25         25         29         37         38 
## cryptocurr      tesla    bitcoin 
##         45         76        189
## Print out the number of words in each document
Row_Sum_Per_doc = rowSums(news_dtm)
print (Row_Sum_Per_doc)
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   8  29  21   2  24  32  23  21  16  13  19  19  22  23  22  24  23  23  25  23 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##  26  11  11  10  22  12  14  14  14  12  22  22  16  12  26  26  26  25   7   5 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##  19  10  12  26  16  24  24  22   9   7  26  26  24  24   9  24  22  20  24  13 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##  22  11  11   6  18  10  23  14  14  14  15  15  18  23  23   4  24  22  24  22 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##  25  24  25  26  23  27  12  12  25   8   5  21  21  12  19  18  14  24  14  27 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##  12  16  21  24  23  26   8  18  26  13  24  24   9  23  27  12  21  24   8  22 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##  24  21  10  19  23   6   6  10  21   5  15  15  26  26   8  27  21  26  26  12 
## 141 142 143 144 145 146 
##   5  12   8  15  10   5
#wordcloud 
library(wordcloud) 
v = sort(colSums(news_dtm),decreasing=TRUE) # get word frequency in Hamilton files
d = data.frame(word = names(v),freq=v) # create a data frame 
write.csv(d, file="words.csv")
head(d,10)
##                  word freq
## bitcoin       bitcoin  189
## tesla           tesla   76
## cryptocurr cryptocurr   45
## invest         invest   38
## billion       billion   37
## record         record   29
## musk             musk   25
## high             high   25
## cnbc             cnbc   21
## elon             elon   21
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 5,
          max.words=200, random.order=FALSE, rot.per=0.35,
          colors=brewer.pal(8, "Dark2"))

News Sentiment over Time

library(textdata)
afinn <- get_sentiments("afinn")

sentiment_summary <- news_words %>%
  left_join(afinn) %>%
  filter(!is.na(value)) %>%
  group_by(Title, Date) %>%
  summarise(score = mean(value)) %>%
  mutate(sentiment = ifelse(score>0, "positive","negative")) 
## Joining, by = "word"
## `summarise()` regrouping output by 'Title' (override with `.groups` argument)
datatable(sentiment_summary)
# plot
ggplot(sentiment_summary, aes(Date, score)) + 
  geom_bar(stat = "identity", aes(fill=sentiment))  + 
  ggtitle("Bitcoin: News Sentiment Over Time")